!mkdir -p data
!wget https://www.dropbox.com/s/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz?dl=0 -O data/data_tutorial_buenrostro.tar.gz
!cd data; tar -xzf data_tutorial_buenrostro.tar.gz
!mkdir -p write
--2021-07-21 22:01:51-- https://www.dropbox.com/s/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz?dl=0 Resolving www.dropbox.com (www.dropbox.com)... 162.125.69.18, 2620:100:6022:18::a27d:4212 Connecting to www.dropbox.com (www.dropbox.com)|162.125.69.18|:443... connected. HTTP request sent, awaiting response... 301 Moved Permanently Location: /s/raw/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz [following] --2021-07-21 22:01:51-- https://www.dropbox.com/s/raw/cwlaaxl70t27tb2/data_tutorial_buenrostro.tar.gz Reusing existing connection to www.dropbox.com:443. HTTP request sent, awaiting response... 302 Found Location: https://ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com/cd/0/inline/BSuXaZQpEUw_qxK18ofp45jQqpkiTD3z_20gzER0LGOVbZ3owCPwex_uWXif-SQmMhxRgeL2P63szgr3l_JGryfKLzBkxqTVN54CpX0Ri3znT5my_kpAwjN4M75_2fM-BjSrGSSQDq3i4Mmh6WBY_snH/file# [following] --2021-07-21 22:01:51-- https://ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com/cd/0/inline/BSuXaZQpEUw_qxK18ofp45jQqpkiTD3z_20gzER0LGOVbZ3owCPwex_uWXif-SQmMhxRgeL2P63szgr3l_JGryfKLzBkxqTVN54CpX0Ri3znT5my_kpAwjN4M75_2fM-BjSrGSSQDq3i4Mmh6WBY_snH/file Resolving ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com (ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com)... 162.125.69.15, 2620:100:6022:15::a27d:420f Connecting to ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com (ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com)|162.125.69.15|:443... connected. HTTP request sent, awaiting response... 302 Found Location: /cd/0/inline2/BSuytoBKPvTcseJbLM25w_Pg54-s8Bc0fK6WvPvOeOXtqFEQD8vQCTvOS-keSud328-sw7lwNPQ9unOVIPFMBUuhzF-ZxlL5Q2GTkfQyoW_fy28ZZQOKA9bjBp6gdxnBOakMyqPikVd0PdjR7RxiNtts7YoiUdqvCHTmuuczyPEGi81vU4dfgxRTJWlPFiBLjmVqmAzbmlUEGUeR7m9qg-7cAdGDV-5ze_b3i4KTqnnqbK6CDolEX6COJ02QhX1LmHERSF6ememwrWZmBLZM9nZe1T0y1bbN6DhuPAjl1kY1HsTSyRnQORhprDfwm0_lpeHI1Ocj4PdO7IrAhKDO-nPpRFyzfhI-GdX1UCby96C_2mJSNHgtN2SjclXRC1TsK38/file [following] --2021-07-21 22:01:52-- https://ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com/cd/0/inline2/BSuytoBKPvTcseJbLM25w_Pg54-s8Bc0fK6WvPvOeOXtqFEQD8vQCTvOS-keSud328-sw7lwNPQ9unOVIPFMBUuhzF-ZxlL5Q2GTkfQyoW_fy28ZZQOKA9bjBp6gdxnBOakMyqPikVd0PdjR7RxiNtts7YoiUdqvCHTmuuczyPEGi81vU4dfgxRTJWlPFiBLjmVqmAzbmlUEGUeR7m9qg-7cAdGDV-5ze_b3i4KTqnnqbK6CDolEX6COJ02QhX1LmHERSF6ememwrWZmBLZM9nZe1T0y1bbN6DhuPAjl1kY1HsTSyRnQORhprDfwm0_lpeHI1Ocj4PdO7IrAhKDO-nPpRFyzfhI-GdX1UCby96C_2mJSNHgtN2SjclXRC1TsK38/file Reusing existing connection to ucbb7f337767264f0b3bcf9f33e5.dl.dropboxusercontent.com:443. HTTP request sent, awaiting response... 200 OK Length: 30829526 (29M) [application/octet-stream] Saving to: ‘data/data_tutorial_buenrostro.tar.gz’ data/data_tutorial_ 100%[===================>] 29.40M 14.1MB/s in 2.1s 2021-07-21 22:01:55 (14.1 MB/s) - ‘data/data_tutorial_buenrostro.tar.gz’ saved [30829526/30829526]
## load library
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import episcanpy.api as epi
import scopen
sc.settings.set_figure_params(dpi=80, color_map='gist_earth')
results_file = 'write/buenrostro_pbmc.h5ad' # the file that will store the analysis results
adata = ad.read('data/data_tutorial_buenrostro/all_buenrostro_bulk_peaks.h5ad')
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name'
# display information stored in adata.obs
adata.obs
| batch | cell_name | |
|---|---|---|
| BM1077-MPP-Frozen-160105-1.dedup.st.bam | 0 | BM1077-MPP-Frozen-160105-1 |
| singles-20160726-scATAC-BM1137-GMP3high-HYC-88.dedup.st.bam | 0 | singles-20160726-scATAC-BM1137-GMP3high-HYC-88 |
| singles-160808-scATAC-BM1137-GMP2mid-LS-65.dedup.st.bam | 0 | singles-160808-scATAC-BM1137-GMP2mid-LS-65 |
| singles-BM0828-LMPP-frozen-151105-20.dedup.st.bam | 0 | singles-BM0828-LMPP-frozen-151105-20 |
| singles-160819-BM1137-CMP-LS-95.dedup.st.bam | 0 | singles-160819-BM1137-CMP-LS-95 |
| ... | ... | ... |
| BM1077-LMPP-Frozen-160107-40.dedup.st.bam | 1 | BM1077-LMPP-Frozen-160107-40 |
| BM1077-MPP-Frozen-160105-74.dedup.st.bam | 1 | BM1077-MPP-Frozen-160105-74 |
| singles-BM1214-GMP-160421-9.dedup.st.bam | 1 | singles-BM1214-GMP-160421-9 |
| singles-BM0828-LMPP-frozen-151105-62.dedup.st.bam | 1 | singles-BM0828-LMPP-frozen-151105-62 |
| singles-BM0828-MEP-160420-43.dedup.st.bam | 1 | singles-BM0828-MEP-160420-43 |
2034 rows × 2 columns
# checking the variable names (bulk peaks coordinates)
print(adata.var_names)
Index(['chr1_10279_10779', 'chr1_13252_13752', 'chr1_16019_16519',
'chr1_29026_29526', 'chr1_96364_96864', 'chr1_115440_115940',
'chr1_237535_238035', 'chr1_240811_241311', 'chr1_540469_540969',
'chr1_713909_714409',
...
'chrX_154822578_154823078', 'chrX_154840821_154841321',
'chrX_154841449_154841949', 'chrX_154841956_154842456',
'chrX_154842517_154843017', 'chrX_154862057_154862557',
'chrX_154870909_154871409', 'chrX_154880741_154881241',
'chrX_154891824_154892324', 'chrX_154896342_154896842'],
dtype='object', name='index', length=491436)
adata.obs['facs_label'] = ['MEP' if 'MEP' in line else line.split('.bam')[0].lstrip('singles-').split('BM')[-1].split('-')[1] for line in adata.obs['cell_name'].tolist()]
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label'
!head 'data/data_tutorial_buenrostro/metadata.tsv'
label cell_type BM1077-CLP-Frozen-160106-13 CLP BM1077-CLP-Frozen-160106-14 CLP BM1077-CLP-Frozen-160106-2 CLP BM1077-CLP-Frozen-160106-21 CLP BM1077-CLP-Frozen-160106-27 CLP BM1077-CLP-Frozen-160106-3 CLP BM1077-CLP-Frozen-160106-36 CLP BM1077-CLP-Frozen-160106-42 CLP BM1077-CLP-Frozen-160106-44 CLP
# format the cell names to match the metadata file
adata.obs_names = [x.split('/')[-1].split('.dedup.st.bam')[0] for x in adata.obs_names.tolist()]
adata.obs_names
Index(['BM1077-MPP-Frozen-160105-1',
'singles-20160726-scATAC-BM1137-GMP3high-HYC-88',
'singles-160808-scATAC-BM1137-GMP2mid-LS-65',
'singles-BM0828-LMPP-frozen-151105-20',
'singles-160819-BM1137-CMP-LS-95', 'BM1077-MPP-Frozen-160105-36',
'singles-20160726-scATAC-BM1214-CMP-LS-84',
'BM1077-CMP-Frozen-160106-21', 'singles-BM0106-HSC-SIM-160219-36',
'singles-BM1214-GMP-160421-60',
...
'singles-160822-BM1137-CMP-LS-91',
'singles-BM0828-CMP-frozen-151118-69',
'singles-20160617-scATAC-BM1214-CMP-LS-40',
'singles-BM0828-GMP-151027-2',
'singles-20160726-scATAC-BM1214-CMP-LS-19',
'BM1077-LMPP-Frozen-160107-40', 'BM1077-MPP-Frozen-160105-74',
'singles-BM1214-GMP-160421-9', 'singles-BM0828-LMPP-frozen-151105-62',
'singles-BM0828-MEP-160420-43'],
dtype='object', length=2034)
epi.pp.load_metadata(adata,
'data/data_tutorial_buenrostro/metadata.tsv',
separator='\t')
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type'
# check the 2 different annotation obtained
pd.crosstab(adata.obs['cell_type'], adata.obs['facs_label'])
| facs_label | CLP | CMP | GMP | GMP1low | GMP2mid | GMP3high | HSC | LMPP | MCP | MEP | MPP | UNK | mono | pDC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cell_type | ||||||||||||||
| CLP | 78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| CMP | 0 | 502 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| GMP | 0 | 0 | 216 | 68 | 44 | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| HSC | 0 | 0 | 0 | 0 | 0 | 0 | 347 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| LMPP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 160 | 0 | 0 | 0 | 0 | 0 | 0 |
| MEP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 138 | 0 | 0 | 0 | 0 |
| MPP | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 142 | 0 | 0 | 0 |
| UNK | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 |
| mono | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 64 | 0 |
| pDC | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 73 | 0 | 0 | 0 | 0 | 68 |
Download annotation file (the data are aligned on hg19)
!wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz -O data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz
!cd data/data_tutorial_buenrostro; gunzip gencode.v19.annotation.gtf
--2021-07-21 22:01:59-- ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
=> ‘data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz’
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.197.74
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.197.74|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done. ==> PWD ... done.
==> TYPE I ... done. ==> CWD (1) /pub/databases/gencode/Gencode_human/release_19 ... done.
==> SIZE gencode.v19.annotation.gtf.gz ... 37991892
==> PASV ... done. ==> RETR gencode.v19.annotation.gtf.gz ... done.
Length: 37991892 (36M) (unauthoritative)
gencode.v19.annotat 100%[===================>] 36.23M 44.4MB/s in 0.8s
2021-07-21 22:02:00 (44.4 MB/s) - ‘data/data_tutorial_buenrostro/gencode.v19.annotation.gtf.gz’ saved [37991892]
gzip: gencode.v19.annotation.gtf: unknown suffix -- ignored
epi.tl.find_genes(adata,
gtf_file='data/data_tutorial_buenrostro/gencode.v19.annotation.gtf',
key_added='transcript_annotation',
upstream=2000,
feature_type='transcript',
annotation='HAVANA',
raw=False)
adata
AnnData object with n_obs × n_vars = 2034 × 491436
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type'
var: 'transcript_annotation'
Check if the data matrix is binary - if not, binarize the data matrix
print(np.max(adata.X))
1.0
# remove any potential empty features or barcodes
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
# filtered out 24210 peaks
adata
AnnData object with n_obs × n_vars = 2034 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features'
var: 'transcript_annotation', 'n_cells'
adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
adata
AnnData object with n_obs × n_vars = 2034 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'facs_label' as categorical FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'cell_type' as categorical FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/anndata/_core/anndata.py:1220: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. ... storing 'transcript_annotation' as categorical
# set a minimum number of cells to keep
min_features = 1000
epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells.png')
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
threshold=min_features, save='Buenrostro_bulk_peaks_coverage_cells_log10.png')
# minimum number of cells sharing a feature
min_cells = 5
epi.pp.coverage_features(adata, binary=True, log=False,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks.png')
epi.pp.coverage_features(adata, binary=True, log=True,
threshold=min_cells, save='Buenrostro_bulk_peaks_coverage_peaks_log10.png')
min_features = 1000
epi.pp.filter_cells(adata, min_features=min_features)
adata
AnnData object with n_obs × n_vars = 1945 × 467226
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness'
min_cells = 5
epi.pp.filter_features(adata, min_cells=min_cells)
adata
AnnData object with n_obs × n_vars = 1945 × 311035
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness'
epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=min_features)
epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=min_cells)
We aim to select a cuttof after the elbow.
epi.pp.cal_var(adata)
FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/seaborn/distributions.py:2557: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). FutureWarning:/home/rs619065/miniconda3/envs/scopen/lib/python3.8/site-packages/seaborn/distributions.py:2557: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
min_score_value = 0.515
nb_feature_selected = 120000
epi.pl.variability_features(adata,log=None,
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix.png')
epi.pl.variability_features(adata,log='log10',
min_score=min_score_value, nb_features=nb_feature_selected,
save='variability_features_plot_bonemarrow_peakmatrix_log10.png')
# save the current matrix in the raw layer
adata.raw = adata
# create a new AnnData containing only the most variable features
adata = epi.pp.select_var_feature(adata,
nb_features=nb_feature_selected,
show=False,
copy=True)
adata
View of AnnData object with n_obs × n_vars = 1945 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
adata
View of AnnData object with n_obs × n_vars = 1945 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pp.filter_cells(adata, min_features=2000)
epi.pp.filter_cells(adata, max_features=25000)
Trying to set attribute `.obs` of view, copying.
adata
AnnData object with n_obs × n_vars = 1722 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
from scopen.Main import scopen_dr
adata.obsm['X_pca'] = np.transpose(scopen_dr(np.transpose(adata.X)))
07/21/2021 22:03:32, running tf-idf transformation... 07/21/2021 22:03:46, violation: 1.00000000 07/21/2021 22:03:46, violation: 0.23809547 07/21/2021 22:03:46, violation: 0.15862548 07/21/2021 22:03:47, violation: 0.10376727 07/21/2021 22:03:47, violation: 0.06349803 07/21/2021 22:03:48, violation: 0.04762651 07/21/2021 22:03:48, violation: 0.04229391 07/21/2021 22:03:49, violation: 0.03723540 07/21/2021 22:03:49, violation: 0.03254635 07/21/2021 22:03:49, violation: 0.02844828 07/21/2021 22:03:50, violation: 0.02518969 07/21/2021 22:03:50, violation: 0.02276936 07/21/2021 22:03:51, violation: 0.02096878 07/21/2021 22:03:51, violation: 0.01953172 07/21/2021 22:03:51, violation: 0.01826928 07/21/2021 22:03:52, violation: 0.01709340 07/21/2021 22:03:52, violation: 0.01597418 07/21/2021 22:03:52, violation: 0.01491398 07/21/2021 22:03:53, violation: 0.01391376 07/21/2021 22:03:53, violation: 0.01298998 07/21/2021 22:03:54, violation: 0.01215548 07/21/2021 22:03:54, violation: 0.01141767 07/21/2021 22:03:54, violation: 0.01077700 07/21/2021 22:03:55, violation: 0.01021965 07/21/2021 22:03:55, violation: 0.00973250 07/21/2021 22:03:56, violation: 0.00930043 07/21/2021 22:03:56, violation: 0.00890897 07/21/2021 22:03:56, violation: 0.00855066 07/21/2021 22:03:57, violation: 0.00821511 07/21/2021 22:03:57, violation: 0.00790385 07/21/2021 22:03:58, violation: 0.00761203 07/21/2021 22:03:58, violation: 0.00733954 07/21/2021 22:03:58, violation: 0.00708217 07/21/2021 22:03:59, violation: 0.00683490 07/21/2021 22:03:59, violation: 0.00659618 07/21/2021 22:04:00, violation: 0.00636058 07/21/2021 22:04:00, violation: 0.00612942 07/21/2021 22:04:00, violation: 0.00589915 07/21/2021 22:04:01, violation: 0.00566925 07/21/2021 22:04:01, violation: 0.00543857 07/21/2021 22:04:02, violation: 0.00521031 07/21/2021 22:04:02, violation: 0.00498624 07/21/2021 22:04:02, violation: 0.00476971 07/21/2021 22:04:03, violation: 0.00455888 07/21/2021 22:04:03, violation: 0.00435613 07/21/2021 22:04:04, violation: 0.00416258 07/21/2021 22:04:04, violation: 0.00397863 07/21/2021 22:04:04, violation: 0.00380585 07/21/2021 22:04:05, violation: 0.00364409 07/21/2021 22:04:05, violation: 0.00349495 07/21/2021 22:04:06, violation: 0.00335874 07/21/2021 22:04:06, violation: 0.00323439 07/21/2021 22:04:06, violation: 0.00311983 07/21/2021 22:04:07, violation: 0.00301411 07/21/2021 22:04:07, violation: 0.00291534 07/21/2021 22:04:08, violation: 0.00282238 07/21/2021 22:04:08, violation: 0.00273499 07/21/2021 22:04:08, violation: 0.00265252 07/21/2021 22:04:09, violation: 0.00257507 07/21/2021 22:04:09, violation: 0.00250194 07/21/2021 22:04:10, violation: 0.00243270 07/21/2021 22:04:10, violation: 0.00236747 07/21/2021 22:04:10, violation: 0.00230589 07/21/2021 22:04:11, violation: 0.00224785 07/21/2021 22:04:11, violation: 0.00219304 07/21/2021 22:04:12, violation: 0.00214112 07/21/2021 22:04:12, violation: 0.00209175 07/21/2021 22:04:12, violation: 0.00204503 07/21/2021 22:04:13, violation: 0.00200065 07/21/2021 22:04:13, violation: 0.00195797 07/21/2021 22:04:14, violation: 0.00191670 07/21/2021 22:04:14, violation: 0.00187690 07/21/2021 22:04:14, violation: 0.00183782 07/21/2021 22:04:15, violation: 0.00179931 07/21/2021 22:04:15, violation: 0.00176117 07/21/2021 22:04:16, violation: 0.00172332 07/21/2021 22:04:16, violation: 0.00168517 07/21/2021 22:04:16, violation: 0.00164690 07/21/2021 22:04:17, violation: 0.00160820 07/21/2021 22:04:17, violation: 0.00156930 07/21/2021 22:04:18, violation: 0.00153021 07/21/2021 22:04:18, violation: 0.00149059 07/21/2021 22:04:18, violation: 0.00145038 07/21/2021 22:04:19, violation: 0.00140987 07/21/2021 22:04:19, violation: 0.00136918 07/21/2021 22:04:20, violation: 0.00132823 07/21/2021 22:04:20, violation: 0.00128681 07/21/2021 22:04:20, violation: 0.00124540 07/21/2021 22:04:21, violation: 0.00120380 07/21/2021 22:04:21, violation: 0.00116222 07/21/2021 22:04:21, violation: 0.00112061 07/21/2021 22:04:22, violation: 0.00107984 07/21/2021 22:04:22, violation: 0.00104014 07/21/2021 22:04:23, violation: 0.00100163 07/21/2021 22:04:23, violation: 0.00096499 07/21/2021 22:04:23, violation: 0.00093034 07/21/2021 22:04:24, violation: 0.00089803 07/21/2021 22:04:24, violation: 0.00086791 07/21/2021 22:04:25, violation: 0.00084000 07/21/2021 22:04:25, violation: 0.00081409 07/21/2021 22:04:25, violation: 0.00078986 07/21/2021 22:04:26, violation: 0.00076719 07/21/2021 22:04:26, violation: 0.00074589 07/21/2021 22:04:27, violation: 0.00072579 07/21/2021 22:04:27, violation: 0.00070673 07/21/2021 22:04:27, violation: 0.00068862 07/21/2021 22:04:28, violation: 0.00067129 07/21/2021 22:04:28, violation: 0.00065468 07/21/2021 22:04:29, violation: 0.00063877 07/21/2021 22:04:29, violation: 0.00062346 07/21/2021 22:04:29, violation: 0.00060870 07/21/2021 22:04:30, violation: 0.00059447 07/21/2021 22:04:30, violation: 0.00058075 07/21/2021 22:04:31, violation: 0.00056743 07/21/2021 22:04:31, violation: 0.00055454 07/21/2021 22:04:31, violation: 0.00054205 07/21/2021 22:04:32, violation: 0.00052990 07/21/2021 22:04:32, violation: 0.00051814 07/21/2021 22:04:33, violation: 0.00050670 07/21/2021 22:04:33, violation: 0.00049561 07/21/2021 22:04:33, violation: 0.00048484 07/21/2021 22:04:34, violation: 0.00047441 07/21/2021 22:04:34, violation: 0.00046425 07/21/2021 22:04:35, violation: 0.00045440 07/21/2021 22:04:35, violation: 0.00044486 07/21/2021 22:04:35, violation: 0.00043552 07/21/2021 22:04:36, violation: 0.00042647 07/21/2021 22:04:36, violation: 0.00041770 07/21/2021 22:04:37, violation: 0.00040919 07/21/2021 22:04:37, violation: 0.00040091 07/21/2021 22:04:37, violation: 0.00039288 07/21/2021 22:04:38, violation: 0.00038507 07/21/2021 22:04:38, violation: 0.00037749 07/21/2021 22:04:39, violation: 0.00037014 07/21/2021 22:04:39, violation: 0.00036299 07/21/2021 22:04:39, violation: 0.00035606 07/21/2021 22:04:40, violation: 0.00034932 07/21/2021 22:04:40, violation: 0.00034277 07/21/2021 22:04:41, violation: 0.00033640 07/21/2021 22:04:41, violation: 0.00033021 07/21/2021 22:04:41, violation: 0.00032421 07/21/2021 22:04:42, violation: 0.00031837 07/21/2021 22:04:42, violation: 0.00031270 07/21/2021 22:04:43, violation: 0.00030719 07/21/2021 22:04:43, violation: 0.00030184 07/21/2021 22:04:43, violation: 0.00029666 07/21/2021 22:04:44, violation: 0.00029162 07/21/2021 22:04:44, violation: 0.00028672 07/21/2021 22:04:45, violation: 0.00028195 07/21/2021 22:04:45, violation: 0.00027731 07/21/2021 22:04:45, violation: 0.00027281 07/21/2021 22:04:46, violation: 0.00026845 07/21/2021 22:04:46, violation: 0.00026420 07/21/2021 22:04:46, violation: 0.00026007 07/21/2021 22:04:47, violation: 0.00025607 07/21/2021 22:04:47, violation: 0.00025218 07/21/2021 22:04:48, violation: 0.00024841 07/21/2021 22:04:48, violation: 0.00024475 07/21/2021 22:04:48, violation: 0.00024118 07/21/2021 22:04:49, violation: 0.00023773 07/21/2021 22:04:49, violation: 0.00023437 07/21/2021 22:04:50, violation: 0.00023110 07/21/2021 22:04:50, violation: 0.00022792 07/21/2021 22:04:50, violation: 0.00022482 07/21/2021 22:04:51, violation: 0.00022178 07/21/2021 22:04:51, violation: 0.00021883 07/21/2021 22:04:52, violation: 0.00021595 07/21/2021 22:04:52, violation: 0.00021315 07/21/2021 22:04:52, violation: 0.00021041 07/21/2021 22:04:53, violation: 0.00020776 07/21/2021 22:04:53, violation: 0.00020517 07/21/2021 22:04:54, violation: 0.00020266 07/21/2021 22:04:54, violation: 0.00020022 07/21/2021 22:04:54, violation: 0.00019785 07/21/2021 22:04:55, violation: 0.00019554 07/21/2021 22:04:55, violation: 0.00019330 07/21/2021 22:04:56, violation: 0.00019111 07/21/2021 22:04:56, violation: 0.00018899 07/21/2021 22:04:56, violation: 0.00018692 07/21/2021 22:04:57, violation: 0.00018492 07/21/2021 22:04:57, violation: 0.00018297 07/21/2021 22:04:58, violation: 0.00018107 07/21/2021 22:04:58, violation: 0.00017923 07/21/2021 22:04:58, violation: 0.00017744 07/21/2021 22:04:59, violation: 0.00017570 07/21/2021 22:04:59, violation: 0.00017400 07/21/2021 22:05:00, violation: 0.00017235 07/21/2021 22:05:00, violation: 0.00017074 07/21/2021 22:05:00, violation: 0.00016917 07/21/2021 22:05:01, violation: 0.00016764 07/21/2021 22:05:01, violation: 0.00016615 07/21/2021 22:05:02, violation: 0.00016470 07/21/2021 22:05:02, violation: 0.00016328 07/21/2021 22:05:02, violation: 0.00016189 07/21/2021 22:05:03, violation: 0.00016054 07/21/2021 22:05:03, violation: 0.00015922 07/21/2021 22:05:04, violation: 0.00015794 07/21/2021 22:05:04, violation: 0.00015670 07/21/2021 22:05:04, violation: 0.00015548 07/21/2021 22:05:05, violation: 0.00015430 07/21/2021 22:05:05, ranks: 30, fitting error: 39.61248220278519
epi.pp.neighbors(adata)
epi.tl.umap(adata)
WARNING: .obsp["connectivities"] have not been computed using umap
epi.pl.umap(adata, color=['nb_features', 'log_nb_features', 'cell_type'], wspace=0.3)
epi.tl.louvain(adata)
epi.pl.umap(adata, color=['louvain', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.getNClusters(adata, n_cluster=14)
step 0 got 17 at resolution 1.5 step 1 got 12 at resolution 0.75 step 2 got 15 at resolution 1.125 step 3 got 14 at resolution 0.9375
epi.pl.umap(adata, color=['louvain', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.kmeans(adata, num_clusters=14)
epi.pl.umap(adata, color=['kmeans', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.hc(adata, num_clusters=14)
epi.pl.umap(adata, color=['hc', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.leiden(adata)
epi.pl.umap(adata, color=['leiden', 'cell_type', 'facs_label'], wspace=0.4)
epi.tl.getNClusters(adata, n_cluster=14, method='leiden')
step 0 got 18 at resolution 1.5 step 1 got 13 at resolution 0.75 step 2 got 16 at resolution 1.125 step 3 got 16 at resolution 0.9375 step 4 got 13 at resolution 0.84375 step 5 got 15 at resolution 0.890625 step 6 got 13 at resolution 0.8671875 step 7 got 15 at resolution 0.87890625 step 8 got 15 at resolution 0.873046875 step 9 got 15 at resolution 0.8701171875 step 10 got 13 at resolution 0.86865234375 step 11 got 13 at resolution 0.869384765625 step 12 got 15 at resolution 0.8697509765625 step 13 got 15 at resolution 0.86956787109375 step 14 got 15 at resolution 0.869476318359375 step 15 got 13 at resolution 0.8694305419921875 step 16 got 13 at resolution 0.8694534301757812 step 17 got 15 at resolution 0.8694648742675781 step 18 got 15 at resolution 0.8694591522216797 step 19 got 13 at resolution 0.8694562911987305
epi.pl.umap(adata, color=['leiden', 'cell_type', 'facs_label'], wspace=0.4)
# True labels
epi.pl.umap(adata, color=['cell_type', 'facs_label'], wspace=0.4)
# Clusters
epi.pl.umap(adata, color=['louvain', 'kmeans', 'hc', 'leiden'], wspace=0.4)
For all these methods. The best value possible is 1.
1) Compute the Adjusted Rand Index for the different clustering to determine which one perform best. It computes a similarity measure between two clusterings (predicted and true labels)by counting cells that are assigned in the same or different clusters between the two clusterings.
print('louvain:\t', epi.tl.ARI(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.ARI(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.ARI(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.ARI(adata, 'leiden', 'cell_type'))
louvain: 0.5026830907128316 kmeans: 0.47398964189429343 hc: 0.4443018268622743 leiden: 0.494350768736209
2) Compute the Homogeneity score. The score is higher when the different clusters contain only cells with the same ground truth label
print('louvain:\t', epi.tl.homogeneity(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.homogeneity(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.homogeneity(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.homogeneity(adata, 'leiden', 'cell_type'))
louvain: 0.620855120000249 kmeans: 0.607717394358658 hc: 0.6095090801233856 leiden: 0.6247759815798529
3) Compute the Adjusted Mutual Information, it is measure of the similarity between two labels of the same data, while accounting for chance (the Mutual information is generally higher for two set of labels with a large number of clusters)
print('louvain:\t', epi.tl.AMI(adata, 'louvain', 'cell_type'))
print('kmeans:\t', epi.tl.AMI(adata, 'kmeans', 'cell_type'))
print('hc:\t', epi.tl.AMI(adata, 'hc', 'cell_type'))
print('leiden:\t', epi.tl.AMI(adata, 'leiden', 'cell_type'))
louvain: 0.6756279946014332 kmeans: 0.6371966070741591 hc: 0.6380117508290446 leiden: 0.664802449730221
## Save the processed Anndata
adata.write(results_file)
adata = ad.read(results_file)
adata
AnnData object with n_obs × n_vars = 1722 × 122511
obs: 'batch', 'cell_name', 'facs_label', 'label', 'cell_type', 'nb_features', 'log_nb_features', 'louvain', 'kmeans', 'hc', 'leiden'
var: 'transcript_annotation', 'n_cells', 'commonness', 'prop_shared_cells', 'variability_score'
uns: 'cell_type_colors', 'facs_label_colors', 'hc_colors', 'kmeans_colors', 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'umap'
obsm: 'X_pca', 'X_umap'
obsp: 'connectivities', 'distances'
epi.tl.rank_features(adata, 'louvain', omic='ATAC')
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
epi.pl.rank_feat_groups(adata)
epi.pl.rank_feat_groups(adata, feature_symbols='transcript_annotation')
epi.pl.rank_feat_groups_matrixplot(adata)
WARNING: dendrogram data not found (using key=dendrogram_louvain). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.